{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 案例：给用户推荐电影"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.io as sio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mat = sio.loadmat('data/ex8_movies.mat')\n",
    "mat.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1682, 943), (1682, 943))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y,R = mat['Y'],mat['R']\n",
    "Y.shape,R.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_mat =sio.loadmat('data/ex8_movieParams.mat')\n",
    "param_mat.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X,Theta,nu,nm,nf = param_mat['X'],param_mat['Theta'],param_mat['num_users'],param_mat['num_movies'],param_mat['num_features']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1682, 10),\n",
       " (943, 10),\n",
       " array([[943]], dtype=uint16),\n",
       " array([[1682]], dtype=uint16),\n",
       " array([[10]], dtype=uint8))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape,Theta.shape,nu,nm,nf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(943, 1682, 10)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nu = int(nu)\n",
    "nm = int(nm)\n",
    "nf = int(nf)\n",
    "nu,nm,nf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 序列化参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def serialize(X,Theta):\n",
    "    \n",
    "    return np.append(X.flatten(),Theta.flatten())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 解序列化参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def deserialize(params,nm,nu,nf):\n",
    "    X = params[:nm*nf].reshape(nm,nf)\n",
    "    Theta = params[nm*nf:].reshape(nu,nf)\n",
    "    return X,Theta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 代价函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def costFunction(params,Y,R,nm,nu,nf,lamda):\n",
    "    X,Theta = deserialize(params,nm,nu,nf)\n",
    "    error = 0.5 * np.square((X@Theta.T - Y)* R).sum()\n",
    "    reg1 = 0.5 * lamda * np.square(X).sum()\n",
    "    reg2 = 0.5 * lamda * np.square(Theta).sum()\n",
    "    return error + reg1 + reg2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22.224603725685675"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users = 4\n",
    "movies = 5\n",
    "features = 3\n",
    "X_sub = X[:movies,:features]\n",
    "Theta_sub = Theta[:users,:features]\n",
    "Y_sub = Y[:movies,:users]\n",
    "R_sub = R[:movies,:users]\n",
    "cost1 = costFunction(serialize(X_sub,Theta_sub),Y_sub,R_sub,movies,users,features,lamda = 0)\n",
    "cost1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31.344056244274217"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cost2 = costFunction(serialize(X_sub,Theta_sub),Y_sub,R_sub,movies,users,features,lamda = 1.5)\n",
    "cost2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.梯度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def costGradient(params,Y,R,nm,nu,nf,lamda):\n",
    "    X,Theta = deserialize(params,nm,nu,nf)\n",
    "    X_grad = ((X@Theta.T-Y)*R)@Theta +lamda * X\n",
    "    Theta_grad = ((X@Theta.T-Y)*R).T@X + lamda * Theta\n",
    "    return serialize(X_grad,Theta_grad)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.添加一个新用户"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_ratings = np.zeros((nm,1))\n",
    "my_ratings[9]   = 5\n",
    "my_ratings[66]  = 5\n",
    "my_ratings[96]   = 5\n",
    "my_ratings[121]  = 4\n",
    "my_ratings[148]  = 4\n",
    "my_ratings[285]  = 3\n",
    "my_ratings[490]  = 4\n",
    "my_ratings[599]  = 4\n",
    "my_ratings[643] = 4\n",
    "my_ratings[958] = 5\n",
    "my_ratings[1117] = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = np.c_[Y,my_ratings]\n",
    "R = np.c_[R,my_ratings!=0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1682, 944)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y.shape\n",
    "R.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "nm,nu = Y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.均值归一化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalizeRatings(Y,R):\n",
    "    Y_mean =(Y.sum(axis=1) / R.sum(axis=1)).reshape(-1,1)\n",
    "    Y_norm = (Y - Y_mean) * R\n",
    "    return Y_norm,Y_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_norm,Y_mean = normalizeRatings(Y,R)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7.参数初始化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.random.random((nm,nf))\n",
    "Theta = np.random.random((nu,nf))\n",
    "params = serialize(X,Theta)\n",
    "lamda = 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from scipy.optimize import minimize\n",
    "res = minimize(fun = costFunction,\n",
    "        x0 = params,\n",
    "        args = (Y_norm,R,nm,nu,nf,lamda),\n",
    "        method = 'TNC',\n",
    "        jac = costGradient,\n",
    "        options = {'maxiter':100})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "params_fit = res.x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_X,fit_Theta = deserialize(params_fit,nm,nu,nf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9.预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_pred = fit_X@fit_Theta.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = Y_pred[:,-1] + Y_mean.flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.argsort(-y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1466, 1292, 1200, 1499, 1121, 1652,  813, 1598, 1188, 1535])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies = []\n",
    "with open('data/movie_ids.txt','r',encoding='latin 1') as f:\n",
    "    for line in f:\n",
    "        tokens = line.strip().split(' ')\n",
    "        movies.append(' '.join(tokens[1:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1682"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(movies)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1466 Saint of Fort Washington, The (1993) 5.001810464524898\n",
      "1292 Star Kid (1997) 5.001694543358842\n",
      "1200 Marlene Dietrich: Shadow and Light (1996) 5.00149967272587\n",
      "1499 Santa with Muscles (1996) 5.001334854576942\n",
      "1121 They Made Me a Criminal (1939) 5.000999967820586\n",
      "1652 Entertaining Angels: The Dorothy Day Story (1996) 5.000975637072753\n",
      "813 Great Day in Harlem, A (1994) 5.000799086576183\n",
      "1598 Someone Else's America (1995) 5.0006016104689905\n",
      "1188 Prefontaine (1997) 5.000467537343162\n",
      "1535 Aiqing wansui (1994) 5.000239077425466\n"
     ]
    }
   ],
   "source": [
    "for i in range(10):\n",
    "    print(index[i],movies[index[i]],y_pred[index[i]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
